# import all packages and set plots to be embedded inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from ydata_profiling import ProfileReport
sns.set()
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
bike=pd.read_csv("fordgobike-tripdata.csv")
bike.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 start_station_latitude 183412 non-null float64 6 start_station_longitude 183412 non-null float64 7 end_station_id 183215 non-null float64 8 end_station_name 183215 non-null object 9 end_station_latitude 183412 non-null float64 10 end_station_longitude 183412 non-null float64 11 bike_id 183412 non-null int64 12 user_type 183412 non-null object 13 member_birth_year 175147 non-null float64 14 member_gender 175147 non-null object 15 bike_share_for_all_trip 183412 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.4+ MB
bike.shape
(183412, 16)
bike.sample(10)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 102036 | 286 | 2019-02-14 17:02:21.2090 | 2019-02-14 17:07:07.7280 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 217.0 | 27th St at MLK Jr Way | 37.817015 | -122.271761 | 185 | Subscriber | 1988.0 | Male | No |
| 24661 | 333 | 2019-02-25 17:58:49.3420 | 2019-02-25 18:04:22.8250 | 15.0 | San Francisco Ferry Building (Harry Bridges Pl... | 37.795392 | -122.394203 | 6.0 | The Embarcadero at Sansome St | 37.804770 | -122.403234 | 6051 | Subscriber | 1984.0 | Female | No |
| 57857 | 327 | 2019-02-21 07:53:26.9930 | 2019-02-21 07:58:54.8480 | 108.0 | 16th St Mission BART | 37.764710 | -122.419957 | 123.0 | Folsom St at 19th St | 37.760594 | -122.414817 | 5846 | Subscriber | 1980.0 | Female | No |
| 55068 | 502 | 2019-02-21 11:08:53.4140 | 2019-02-21 11:17:15.5420 | 369.0 | Hyde St at Post St | 37.787349 | -122.416651 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 2664 | Subscriber | 1988.0 | Male | No |
| 47381 | 432 | 2019-02-22 08:50:39.7830 | 2019-02-22 08:57:52.1990 | 36.0 | Folsom St at 3rd St | 37.783830 | -122.398870 | 15.0 | San Francisco Ferry Building (Harry Bridges Pl... | 37.795392 | -122.394203 | 5344 | Subscriber | 1971.0 | Male | No |
| 110231 | 723 | 2019-02-12 18:05:20.5450 | 2019-02-12 18:17:24.2270 | 160.0 | West Oakland BART Station | 37.805318 | -122.294837 | 159.0 | 24th St at Market St | 37.816060 | -122.278244 | 3638 | Subscriber | 1988.0 | Male | No |
| 62977 | 597 | 2019-02-20 16:06:01.6930 | 2019-02-20 16:15:58.9940 | 104.0 | 4th St at 16th St | 37.767045 | -122.390833 | 27.0 | Beale St at Harrison St | 37.788059 | -122.391865 | 5967 | Subscriber | 1966.0 | Female | No |
| 163656 | 447 | 2019-02-05 09:06:13.4330 | 2019-02-05 09:13:40.5750 | 36.0 | Folsom St at 3rd St | 37.783830 | -122.398870 | 17.0 | Embarcadero BART Station (Beale St at Market St) | 37.792251 | -122.397086 | 5007 | Subscriber | 1980.0 | Male | No |
| 21490 | 384 | 2019-02-26 09:12:13.1430 | 2019-02-26 09:18:38.0420 | 66.0 | 3rd St at Townsend St | 37.778742 | -122.392741 | 19.0 | Post St at Kearny St | 37.788975 | -122.403452 | 5489 | Subscriber | 1980.0 | Male | No |
| 164076 | 765 | 2019-02-05 08:39:42.4930 | 2019-02-05 08:52:28.0540 | 22.0 | Howard St at Beale St | 37.789756 | -122.394643 | 88.0 | 11th St at Bryant St | 37.770030 | -122.411726 | 4981 | Subscriber | 1983.0 | Male | No |
bike.duplicated().sum()
0
bike.describe()
| duration_sec | start_station_id | start_station_latitude | start_station_longitude | end_station_id | end_station_latitude | end_station_longitude | bike_id | member_birth_year | |
|---|---|---|---|---|---|---|---|---|---|
| count | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183412.000000 | 175147.000000 |
| mean | 726.078435 | 138.590427 | 37.771223 | -122.352664 | 136.249123 | 37.771427 | -122.352250 | 4472.906375 | 1984.806437 |
| std | 1794.389780 | 111.778864 | 0.099581 | 0.117097 | 111.515131 | 0.099490 | 0.116673 | 1664.383394 | 10.116689 |
| min | 61.000000 | 3.000000 | 37.317298 | -122.453704 | 3.000000 | 37.317298 | -122.453704 | 11.000000 | 1878.000000 |
| 25% | 325.000000 | 47.000000 | 37.770083 | -122.412408 | 44.000000 | 37.770407 | -122.411726 | 3777.000000 | 1980.000000 |
| 50% | 514.000000 | 104.000000 | 37.780760 | -122.398285 | 100.000000 | 37.781010 | -122.398279 | 4958.000000 | 1987.000000 |
| 75% | 796.000000 | 239.000000 | 37.797280 | -122.286533 | 235.000000 | 37.797320 | -122.288045 | 5502.000000 | 1992.000000 |
| max | 85444.000000 | 398.000000 | 37.880222 | -121.874119 | 398.000000 | 37.880222 | -121.874119 | 6645.000000 | 2001.000000 |
bike.dtypes
duration_sec int64 start_time object end_time object start_station_id float64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id float64 end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id int64 user_type object member_birth_year float64 member_gender object bike_share_for_all_trip object dtype: object
bike.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_id 197 start_station_name 197 start_station_latitude 0 start_station_longitude 0 end_station_id 197 end_station_name 197 end_station_latitude 0 end_station_longitude 0 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 bike_share_for_all_trip 0 dtype: int64
bike.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'start_station_latitude',
'start_station_longitude', 'end_station_id', 'end_station_name',
'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
'member_birth_year', 'member_gender', 'bike_share_for_all_trip'],
dtype='object')
bike['bike_share_for_all_trip'].value_counts()
No 166053 Yes 17359 Name: bike_share_for_all_trip, dtype: int64
bike['user_type'].value_counts()
Subscriber 163544 Customer 19868 Name: user_type, dtype: int64
bike['member_gender'].value_counts()
Male 130651 Female 40844 Other 3652 Name: member_gender, dtype: int64
bike['member_birth_year'].value_counts()
1988.0 10236
1993.0 9325
1989.0 8972
1990.0 8658
1991.0 8498
...
1928.0 1
1878.0 1
1930.0 1
1910.0 1
1927.0 1
Name: member_birth_year, Length: 75, dtype: int64
bike['start_station_name'].value_counts()
Market St at 10th St 3904
San Francisco Caltrain Station 2 (Townsend St at 4th St) 3544
Berry St at 4th St 3052
Montgomery St BART Station (Market St at 2nd St) 2895
Powell St BART Station (Market St at 4th St) 2760
...
Willow St at Vine St 9
Parker Ave at McAllister St 7
Palm St at Willow St 4
21st Ave at International Blvd 4
16th St Depot 2
Name: start_station_name, Length: 329, dtype: int64
bike['end_station_name'].value_counts()
San Francisco Caltrain Station 2 (Townsend St at 4th St) 4857
Market St at 10th St 3973
Montgomery St BART Station (Market St at 2nd St) 3647
San Francisco Ferry Building (Harry Bridges Plaza) 3368
Powell St BART Station (Market St at 4th St) 2997
...
Parker Ave at McAllister St 9
Palm St at Willow St 7
16th St Depot 6
21st Ave at International Blvd 6
Willow St at Vine St 5
Name: end_station_name, Length: 329, dtype: int64
## make a copy of dataframe
bike_19=bike.copy()
## convert the timestamp columns (start_time, end_time) from text to datetime64
time_cols = [col for col in bike_19.columns if "time" in col]
bike_19[time_cols] = bike_19[time_cols].apply(pd.to_datetime)
## store the identifier columns as text: they are labels, not quantities.
## Casting through the nullable "Int64" dtype first keeps missing IDs as <NA>
## (a plain astype(str) turns NaN into the literal string "nan", which hides
## the missing station IDs from isnull()) and renders 7.0 as "7" not "7.0".
id_cols = [col for col in bike_19.columns if col.endswith("_id")]
bike_19[id_cols] = bike_19[id_cols].astype("Int64").astype("string")
bike_19.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_id 0 start_station_name 197 start_station_latitude 0 start_station_longitude 0 end_station_id 0 end_station_name 197 end_station_latitude 0 end_station_longitude 0 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 bike_share_for_all_trip 0 dtype: int64
bike_19.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'start_station_latitude',
'start_station_longitude', 'end_station_id', 'end_station_name',
'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
'member_birth_year', 'member_gender', 'bike_share_for_all_trip'],
dtype='object')
## remove unnecessary columns (start_station_id, end_station_id, start_station_latitude, start_station_longitude); end_station_latitude and end_station_longitude are dropped in a separate step below
bike_19.drop(['start_station_id','start_station_latitude','start_station_longitude','end_station_id'], axis=1, inplace=True)
bike_19.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null datetime64[ns] 2 end_time 183412 non-null datetime64[ns] 3 start_station_name 183215 non-null object 4 end_station_name 183215 non-null object 5 end_station_latitude 183412 non-null float64 6 end_station_longitude 183412 non-null float64 7 bike_id 183412 non-null object 8 user_type 183412 non-null object 9 member_birth_year 175147 non-null float64 10 member_gender 175147 non-null object 11 bike_share_for_all_trip 183412 non-null object dtypes: datetime64[ns](2), float64(3), int64(1), object(6) memory usage: 16.8+ MB
bike_19.drop(['end_station_latitude','end_station_longitude'], axis=1, inplace=True)
bike_19.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null datetime64[ns] 2 end_time 183412 non-null datetime64[ns] 3 start_station_name 183215 non-null object 4 end_station_name 183215 non-null object 5 bike_id 183412 non-null object 6 user_type 183412 non-null object 7 member_birth_year 175147 non-null float64 8 member_gender 175147 non-null object 9 bike_share_for_all_trip 183412 non-null object dtypes: datetime64[ns](2), float64(1), int64(1), object(6) memory usage: 14.0+ MB
## add new columns duration by minute
bike_19['duration_min']=bike_19['duration_sec']/60
bike_19.sample(5)
| duration_sec | start_time | end_time | start_station_name | end_station_name | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 65239 | 188 | 2019-02-20 09:58:32.054 | 2019-02-20 10:01:41.031 | Post St at Kearny St | Montgomery St BART Station (Market St at 2nd St) | 6595 | Subscriber | 1971.0 | Male | No | 3.133333 |
| 5074 | 392 | 2019-02-28 14:08:18.337 | 2019-02-28 14:14:50.668 | The Embarcadero at Sansome St | Davis St at Jackson St | 5031 | Subscriber | 1989.0 | Female | No | 6.533333 |
| 167804 | 699 | 2019-02-04 16:45:05.532 | 2019-02-04 16:56:44.867 | 4th St at Mission Bay Blvd S | Beale St at Harrison St | 2758 | Subscriber | 1982.0 | Male | No | 11.650000 |
| 139514 | 1063 | 2019-02-07 20:47:15.498 | 2019-02-07 21:04:58.895 | 23rd St at Tennessee St | 20th St at Bryant St | 5145 | Subscriber | 1985.0 | Male | No | 17.716667 |
| 181947 | 839 | 2019-02-01 08:37:20.847 | 2019-02-01 08:51:20.561 | Telegraph Ave at 19th St | Emeryville Public Market | 4476 | Subscriber | 1988.0 | Male | No | 13.983333 |
bike_19.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_name 197 end_station_name 197 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 bike_share_for_all_trip 0 duration_min 0 dtype: int64
## remove rows with a missing start_station_name or end_station_name
## (both columns are listed explicitly so the code matches the stated intent,
## and a list is accepted by every pandas version)
bike_19.dropna(subset=['start_station_name','end_station_name'],inplace=True)
bike_19.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_name 0 end_station_name 0 bike_id 0 user_type 0 member_birth_year 8263 member_gender 8263 bike_share_for_all_trip 0 duration_min 0 dtype: int64
## add new column member_age holding the member's age in 2019, computed from member_birth_year
bike_19['member_age']=2019-bike_19['member_birth_year']
bike_19.sample(5)
| duration_sec | start_time | end_time | start_station_name | end_station_name | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | member_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 91964 | 1827 | 2019-02-15 20:35:56.273 | 2019-02-15 21:06:23.341 | 4th St at 16th St | 24th St at Chattanooga St | 2567 | Subscriber | 1996.0 | Female | No | 30.450000 | 23.0 |
| 152138 | 1207 | 2019-02-06 16:12:01.999 | 2019-02-06 16:32:09.506 | Montgomery St BART Station (Market St at 2nd St) | 17th St at Valencia St | 3284 | Subscriber | NaN | NaN | No | 20.116667 | NaN |
| 90294 | 632 | 2019-02-16 13:04:57.632 | 2019-02-16 13:15:30.303 | Market St at Franklin St | Market St at Dolores St | 1596 | Customer | 1984.0 | Other | No | 10.533333 | 35.0 |
| 151282 | 732 | 2019-02-06 17:19:40.051 | 2019-02-06 17:31:52.929 | Union Square (Powell St at Post St) | Rhode Island St at 17th St | 4626 | Subscriber | 1970.0 | Male | No | 12.200000 | 49.0 |
| 38283 | 767 | 2019-02-23 12:15:18.239 | 2019-02-23 12:28:05.466 | Union Square (Powell St at Post St) | Laguna St at Hayes St | 6292 | Subscriber | 1995.0 | Male | Yes | 12.783333 | 24.0 |
bike_19['member_age'].value_counts()
31.0 10214
26.0 9323
30.0 8967
29.0 8640
28.0 8484
...
91.0 1
141.0 1
89.0 1
109.0 1
92.0 1
Name: member_age, Length: 75, dtype: int64
px.bar(data_frame=bike_19['member_age'].value_counts(),title="member age")
px.histogram(data_frame=bike_19,x='member_age',title="member age")
## as we can see, there are a lot of outliers; they should be removed, and I think 70 years is a reasonable upper limit
bike_19.drop('member_birth_year', axis=1, inplace=True)
bike_19.sample(10)
| duration_sec | start_time | end_time | start_station_name | end_station_name | bike_id | user_type | member_gender | bike_share_for_all_trip | duration_min | member_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 10739 | 596 | 2019-02-27 19:43:17.045 | 2019-02-27 19:53:13.312 | Valencia St at Clinton Park | Garfield Square (25th St at Harrison St) | 5469 | Subscriber | Male | No | 9.933333 | 23.0 |
| 68222 | 789 | 2019-02-20 05:21:28.576 | 2019-02-20 05:34:37.715 | McAllister St at Baker St | 1st St at Folsom St | 6516 | Subscriber | Female | No | 13.150000 | 31.0 |
| 64179 | 787 | 2019-02-20 12:40:36.858 | 2019-02-20 12:53:44.187 | Doyle St at 59th St | Doyle St at 59th St | 828 | Customer | NaN | No | 13.116667 | NaN |
| 71084 | 528 | 2019-02-19 17:44:42.828 | 2019-02-19 17:53:31.297 | San Francisco Caltrain (Townsend St at 4th St) | Bryant St at 15th St | 4624 | Subscriber | Male | No | 8.800000 | 30.0 |
| 132217 | 672 | 2019-02-09 09:01:07.710 | 2019-02-09 09:12:20.300 | West St at 40th St | Telegraph Ave at 19th St | 3131 | Subscriber | Male | No | 11.200000 | 29.0 |
| 119244 | 145 | 2019-02-11 18:06:43.106 | 2019-02-11 18:09:08.939 | 11th St at Bryant St | Division St at Potrero Ave | 1809 | Subscriber | Male | No | 2.416667 | 36.0 |
| 73382 | 1069 | 2019-02-19 13:47:11.539 | 2019-02-19 14:05:01.535 | S Van Ness Ave at Market St | The Embarcadero at Sansome St | 5335 | Subscriber | Male | No | 17.816667 | 28.0 |
| 179244 | 457 | 2019-02-01 14:43:03.081 | 2019-02-01 14:50:40.523 | Union Square (Powell St at Post St) | Townsend St at 5th St | 4445 | Subscriber | Male | No | 7.616667 | 23.0 |
| 85440 | 2514 | 2019-02-17 14:37:58.646 | 2019-02-17 15:19:53.523 | Esprit Park | Lombard St at Columbus Ave | 5874 | Subscriber | Female | No | 41.900000 | 25.0 |
| 163488 | 977 | 2019-02-05 09:06:38.631 | 2019-02-05 09:22:56.546 | Webster St at Grove St | Hubbell St at 16th St | 1755 | Subscriber | Male | No | 16.283333 | 46.0 |
## keep riders aged 70 or younger; rows with a missing member_age are dropped
## as well, because NaN <= 70 evaluates to False.
## The explicit copy makes bike_19 an independent frame, so the column
## assignments that follow do not raise SettingWithCopyWarning.
bike_19=bike_19.query("member_age<=70").copy()
px.histogram(data_frame=bike_19,x='member_age',title="member age")
px.box(data_frame=bike_19,y='member_age',title="member age")
bike_19['member_age'].describe()
count 174377.000000 mean 34.038876 std 9.714521 min 18.000000 25% 27.000000 50% 32.000000 75% 39.000000 max 70.000000 Name: member_age, dtype: float64
bike_19.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_name 0 end_station_name 0 bike_id 0 user_type 0 member_gender 0 bike_share_for_all_trip 0 duration_min 0 member_age 0 dtype: int64
bike_19.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174377 entries, 0 to 183411 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174377 non-null int64 1 start_time 174377 non-null datetime64[ns] 2 end_time 174377 non-null datetime64[ns] 3 start_station_name 174377 non-null object 4 end_station_name 174377 non-null object 5 bike_id 174377 non-null object 6 user_type 174377 non-null object 7 member_gender 174377 non-null object 8 bike_share_for_all_trip 174377 non-null object 9 duration_min 174377 non-null float64 10 member_age 174377 non-null float64 dtypes: datetime64[ns](2), float64(2), int64(1), object(6) memory usage: 16.0+ MB
## add new column s_day,s_hour
bike_19["s_day"] = bike_19["start_time"].dt.day_name()
bike_19["s_hour"] = bike_19["start_time"].dt.hour
bike_19.sample()
| duration_sec | start_time | end_time | start_station_name | end_station_name | bike_id | user_type | member_gender | bike_share_for_all_trip | duration_min | member_age | s_day | s_hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 74621 | 358 | 2019-02-19 10:01:24.636 | 2019-02-19 10:07:22.641 | Haste St at Telegraph Ave | University Ave at Oxford St | 6444 | Subscriber | Male | Yes | 5.966667 | 22.0 | Tuesday | 10 |
bike_19.dtypes
duration_sec int64 start_time datetime64[ns] end_time datetime64[ns] start_station_name object end_station_name object bike_id object user_type object member_gender object bike_share_for_all_trip object duration_min float64 member_age float64 s_day object s_hour int64 dtype: object
## duration_sec is redundant now that duration_min has been derived from it
bike_19.drop('duration_sec', axis=1, inplace=True)
## NOTE: astype(int) truncates toward zero instead of rounding, so a
## 119-second trip (1.98 min) is stored as 1 minute
bike_19['duration_min']=bike_19['duration_min'].astype(int)
## member_age was computed as 2019 - member_birth_year; cast it to an integer dtype
bike_19['member_age']=bike_19['member_age'].astype(int)
bike_19.dtypes
start_time datetime64[ns] end_time datetime64[ns] start_station_name object end_station_name object bike_id object user_type object member_gender object bike_share_for_all_trip object duration_min int32 member_age int32 s_day object s_hour int64 dtype: object
bike_19.set_index('bike_id', inplace=True)
bike_19.sample()
| start_time | end_time | start_station_name | end_station_name | user_type | member_gender | bike_share_for_all_trip | duration_min | member_age | s_day | s_hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| bike_id | |||||||||||
| 4759 | 2019-02-12 11:09:01.071 | 2019-02-12 11:22:34.881 | Steuart St at Market St | Bryant St at 6th St | Subscriber | Male | No | 13 | 36 | Tuesday | 11 |
## store the cleaned data to a csv file.
## bike_id was moved into the index above, so the index has to be written
## out: index=False would silently drop bike_id from the saved file.
bike_19.to_csv("cleand_fordgobike-tripdata.csv",index=True)
## reload the cleaned data; parse_dates restores the datetime dtype that a
## plain read_csv would bring back as text
bike_19=pd.read_csv("cleand_fordgobike-tripdata.csv",parse_dates=["start_time","end_time"])
The original combined data set contains 183,412 individual trip records. There are 16 variables in this data set, which can be divided into three major categories:
Trip duration: This category includes the duration_sec, start_time, and end_time variables. These variables provide information about the length of each trip, the time the trip started, and the time the trip ended. Station information: This category includes the start_station_id, start_station_name, start_station_latitude, start_station_longitude, end_station_id, end_station_name, end_station_latitude, and end_station_longitude variables. These variables provide information about the start and end stations for each trip. Member information (anonymized): This category includes the bike_id, user_type, member_birth_year, member_gender, and bike_share_for_all_trip variables. These variables provide information about the bike used for each trip, the user type (member or casual rider), the member's birth year, the member's gender, and whether the member has a bike share pass for all trips. In addition to the original variables, the following derived features were created to assist with exploration and analysis:
Trip information: The duration_min variable was created by dividing the duration_sec variable by 60. The s_day, and s_hour, were created by extracting the corresponding information from the start_time variable. Member: The member_age variable was created by calculating the age of the member based on their birth year.
I am interested in exploring the patterns of bike trips in terms of their duration and rental events, and how these patterns relate to the riders' characteristics, such as their user type, gender, and age. I want to get a sense of how people are using the bike-sharing service and what they are using it for.
Some specific questions you want to answer include:
When are most trips taken, in terms of time of day and day of the week? How long does the average trip take? Do these patterns differ depending on whether the user is a subscriber or a customer? I can answer these questions by conducting a statistical analysis of the data, using descriptive statistics to summarize it.
For example, I could use a histogram to visualize the distribution of trip durations. I could also describe the difference in the average trip duration between subscribers and customers.
By analyzing the data in this way, I can gain a deeper understanding of how people are using the bike-sharing service. This information can be used to improve the service and make it more user-friendly.
Here are some additional questions I could explore:
What are the most popular start and end stations? Do people tend to take longer trips during the weekend? Are there any differences in trip patterns between men and women? Do younger people tend to take shorter trips? By asking these questions and exploring the data, I can learn more about how people are using the bike-sharing service and how it can be improved.
The start date/time and duration of each trip can be used to understand how long a trip typically takes and when it is most likely to occur. The user information, such as user type, gender, and age, can be used to identify the main target customer groups. By summarizing the bike usage data for different groups of riders, we can see if there are any special patterns associated with a specific group.
For example, we might find that subscribers tend to take longer trips than customers, or that men tend to take more trips than women. We might also find that younger people tend to take shorter trips than older people.
This information can be used to improve the bike-sharing service by targeting different groups of riders with different marketing messages. For example, we might target subscribers with messages about longer trips, or we might target women with messages about safety.
Here are some specific questions you could ask:
What is the average trip duration for subscribers? For customers? What are the most popular times of day for trips? For weekdays? For weekends? What are the most popular start and end stations? Do men or women take more trips? Do younger or older people take more trips? Do subscribers or customers take longer trips? By asking these questions and exploring the data, you can learn more about how people are using the bike-sharing service and how it can be improved.
bike_19.sample(7)
| start_time | end_time | start_station_name | end_station_name | user_type | member_gender | bike_share_for_all_trip | duration_min | member_age | s_day | s_hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 151834 | 2019-02-05 17:44:39.038 | 2019-02-05 17:53:52.199 | Powell St BART Station (Market St at 5th St) | 8th St at Ringold St | Subscriber | Male | No | 9 | 38 | Tuesday | 17 |
| 174070 | 2019-02-01 07:08:00.713 | 2019-02-01 07:18:22.040 | Howard St at Beale St | The Embarcadero at Vallejo St | Subscriber | Female | No | 10 | 45 | Friday | 7 |
| 85057 | 2019-02-16 15:05:39.221 | 2019-02-16 15:14:50.254 | Hearst Ave at Euclid Ave | North Berkeley BART Station | Subscriber | Male | No | 9 | 35 | Saturday | 15 |
| 108670 | 2019-02-12 09:44:18.766 | 2019-02-12 09:50:57.895 | Vine St at Shattuck Ave | Hearst Ave at Euclid Ave | Subscriber | Male | No | 6 | 30 | Tuesday | 9 |
| 11318 | 2019-02-27 18:19:51.308 | 2019-02-27 18:22:40.937 | Harrison St at 17th St | 19th St at Florida St | Subscriber | Male | No | 2 | 39 | Wednesday | 18 |
| 154262 | 2019-02-05 12:28:21.080 | 2019-02-05 12:29:41.289 | 4th St at 16th St | 4th St at Mission Bay Blvd S | Subscriber | Male | No | 1 | 20 | Tuesday | 12 |
| 11554 | 2019-02-27 18:05:38.696 | 2019-02-27 18:07:38.393 | Frank H Ogawa Plaza | MLK Jr Way at 14th St | Subscriber | Male | No | 1 | 31 | Wednesday | 18 |
bike_19.describe()
| duration_min | member_age | s_hour | |
|---|---|---|---|
| count | 174377.000000 | 174377.000000 | 174377.000000 |
| mean | 11.245692 | 34.038876 | 13.458260 |
| std | 27.404801 | 9.714521 | 4.736039 |
| min | 1.000000 | 18.000000 | 0.000000 |
| 25% | 5.000000 | 27.000000 | 9.000000 |
| 50% | 8.000000 | 32.000000 | 14.000000 |
| 75% | 13.000000 | 39.000000 | 17.000000 |
| max | 1409.000000 | 70.000000 | 23.000000 |
## creating a function that carries the visualizations titles,x_label and y_label
def plot_label(title,x_label,y_label):
    """Annotate the current matplotlib axes.

    Sets the plot title and both axis labels, then rotates the x-axis tick
    labels by 45 degrees.
    """
    axes = plt.gca()
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    plt.xticks(rotation=45)
## How does the trip distribution vary by hour of the day?
color_base=sns.color_palette()[0]
sns.countplot(data=bike_19,x='s_hour',color=color_base)
plot_label("Trip Start Hour Of The Day","Hour Of Day","Count")
plt.show();
## How does the trip distribution vary by day of the week?
days = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
sns.countplot(data=bike_19,x='s_day',color=color_base,order=days)
plot_label("Trip Start Day Of The Week ","Day Of The Week","Count")
## What are the characteristics of the riders and members, to better understand their composition?
sns.countplot(data=bike_19,x='user_type',color=color_base)
plot_label("Member Type","User Type","Count");
sns.countplot(data=bike_19,x='member_gender',color=color_base)
plot_label("Member Gender","Gender","Count")
sns.countplot(data=bike_19,x='bike_share_for_all_trip',color=color_base)
plot_label("Bike Share Trip","Bike Share For All Trip","Count");
bins=np.arange(0,bike_19["member_age"].max()+5,5)
plt.hist(data=bike_19,x="member_age",bins=bins)
plot_label("Member Age","Age","count")
bike_19["duration_min"].describe([.99])
count 174377.000000 mean 11.245692 std 27.404801 min 1.000000 50% 8.000000 99% 52.000000 max 1409.000000 Name: duration_min, dtype: float64
fig = px.histogram(
data_frame=bike_19,
x='duration_min',
title='Distribution of Trip Durations'
)
fig.update_layout(
xaxis_title='Duration (minutes)',
yaxis_title='Count',
xaxis_range=[0, 70]
)
fig.show()
The number of trips peaked around 8-9 am and 5-6 pm on work days (Mon-Fri) compared to weekends.
In terms of user demographics, there were more male riders than female, and most members were subscribers compared to casual riders. The majority of the members did not use bike share for all of their trips and were between 20 and 40 years old.
Most rides were quick and short, lasting between 5 and 10 minutes. However, there were some very long outliers of close to 24 hours. Fortunately, no transformation was needed because the data were straightforward.
To focus on the typical use of the system, the outlier long trip records were filtered out using .query(). The distribution of trip duration was initially unclear but became much clearer after using .describe() to find where most trip records fell and then restricting the x-axis to that smaller range.
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
### How does the trip duration distribution differ between customers and subscribers?
## trips longer than 60 minutes are excluded so the bulk of the distribution stays readable;
## inner='quartile' draws the quartile lines inside each violin
sns.violinplot(data=bike_19.query('duration_min<=60'), x='user_type', y='duration_min', color=color_base,inner='quartile')
plot_label("trip duration distribution",'User Type','Trip Duration in Minutes')
### How does the trip duration distribution differ by gender?
fig = px.box(
data_frame=bike_19.query('duration_min<=60'),
x='member_gender',
y='duration_min',
title='Distribution of Trip Durations by Gender'
)
fig.update_layout(
xaxis_title='Gender',
yaxis_title='Duration (minutes)'
)
fig.show()
#### Is the average duration of a trip different for different days of the week? If so,
#### which days have the longest and shortest average durations?
sns.barplot(data=bike_19,
x="s_day",
y="duration_min",
order=days,
color=color_base)
plot_label("Trip Duration by Day of Week","Day of Week",'Average Trip Duration (minutes)')
#### What is the average age of members on each day of the week?
sns.barplot(data=bike_19,x="s_day",y="member_age",color=color_base,order=days)
plot_label("AVG member age due to the day of the week","Day Of The Week","Age");
#### What is the distribution of trips by hour of the day for Customer and Subscriber users?
sns.countplot(data=bike_19,x='s_hour',hue='user_type')
plot_label("Hour of the day vs user_type","Hour Of Day","Count");
### What is the distribution of trips by day of the week for Customer and Subscriber users?
sns.countplot(data=bike_19,x='s_day',hue='user_type',order=days)
## the x-axis shows the weekday (user_type is only the hue), so label it as such
plot_label("weekly day vs user_type","Day Of The Week","Count");
### What is the distribution of age for Customer and Subscriber users?
fig = px.box(data_frame=bike_19, x="user_type", y="member_age")
fig.update_layout(
title="Member Age Distribution by User Type",
yaxis_title="Member Age"
)
fig.show()
The bike-sharing system is used more by subscribers than customers. Subscribers tend to use the system for work commutes, resulting in most trips being on work days (Mon-Fri) and especially during rush hours (when going to work in the morning and getting off work in the afternoon). On the other hand, customers tend to ride for fun in the afternoon or early evenings over weekends.
Subscribers are slightly older than customers on average but take much shorter/quicker rides.
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
### How does the average trip duration vary between customers and subscribers on weekdays?
sns.pointplot(data=bike_19,x="s_day",y="duration_min",hue="user_type",dodge=.3,order=days)
plot_label("the average trip duration vary between customers and subscribers on weekdays","s_day","AVG Trip Duration In Minute ")
The multivariate exploration strengthened some of the patterns discovered in the previous bivariate exploration as well as univariate exploration. The relationship between the multiple variables plotted is visualized altogether and information is presented combined. The efficient/short period of usage for subscribers corresponds to their high concentration on rush hours Monday through Friday, indicating that the use is primarily for work commutes. The more relaxing and flexible pattern of customer use shows that they’re taking advantage of the bike-sharing system quite differently from the subscribers, heavily over weekends and in the afternoon, probably for city tours or leisure purposes.
The interactions between features all supplement each other and make sense when looked at together; no big surprise was observed. The difference in usage habits between males and females is not large or obvious throughout the exploration, which could be related to the imbalanced number of female riders/records compared to male ones. It would be interesting to see how males and females use the system differently if there were more female data.